Data Visualization#
Load data#
Show code cell source
import pandas as pd
import sys
sys.path.append('../')
from utils.plots import *
output_notebook()
file_path = '../data/'
model_name = 'AML Epigenomic Risk'

# Read the main results and the signature results (first spreadsheet column
# becomes the index), sort both by index and join them on that index.
df = pd.read_excel(file_path + 'alma_main_results.xlsx', index_col=0).sort_index()
sig_results = pd.read_excel(file_path + 'signature_results.xlsx', index_col=0).sort_index()
df = df.join(sig_results)

# Define train and test samples
df_train = df[df['Train-Test'] == 'Train Sample']
df_test = df[df['Train-Test'] == 'Test Sample']

# Remove duplicates from the test cohort (the last row per Patient_ID is kept)
df_test = df_test[~df_test['Patient_ID'].duplicated(keep='last')]
# Leave out the UF HemBank samples. The explicit .copy() matters: a column of
# df_test is reassigned further down in the notebook, and without the copy that
# assignment targets a filtered slice of df (pandas SettingWithCopyWarning).
df_test = df_test[~df_test['Clinical Trial'].isin(['UF HemBank'])].copy()

# Prognostic model samples: rows with a non-missing 5-year vital status
df_px = df[~df['Vital Status at 5y'].isna()]
# ... restricted to three clinical trials and to the listed sample types,
# again keeping the last row per Patient_ID.
df_px2 = df_px[df_px['Clinical Trial'].isin(['AAML0531', 'AAML1031', 'AAML03P1'])]
df_px2 = df_px2[df_px2['Sample Type'].isin(
    ['Diagnosis', 'Primary Blood Derived Cancer - Bone Marrow', 'Primary Blood Derived Cancer - Peripheral Blood'])]
df_px2 = df_px2[~df_px2['Patient_ID'].duplicated(keep='last')]

# Drop the training samples with missing labels for the WHO 2022 Diagnosis
df_dx = df_train[~df_train['WHO 2022 Diagnosis'].isna()]
# exclude the classes with fewer than 5 samples
df_dx = df_dx[~df_dx['WHO 2022 Diagnosis'].isin(['AML with t(9;22); BCR::ABL1'])]

# Rank the prognostic samples by 'P(Death) at 5y': after sorting, the first
# reset_index() moves the sample index into a column called 'index' (this relies
# on the index being unnamed) and the second one stores the 0..n-1 sort position
# in 'Percentile'.
df_px_ = (
    df_px.sort_values(by='P(Death) at 5y')
    .reset_index()
    .reset_index(names=['Percentile'])
    .set_index('index')
)
# Position divided by n, i.e. a fractional rank in [0, 1)
df_px_['Percentile'] = df_px_['Percentile'] / len(df_px_['Percentile'])
# Samples outside df_px get NaN in 'Percentile' (left join on the index)
df2 = df.join(df_px_[['Percentile']])
# from utils.alma_plot import *
# plot_alma(df2, save_html=False)
# from utils.alma_plot2 import *
# df_px_ = df_px.sort_values(by='38CpG-AMLsignature').reset_index().reset_index(names=['Percentile']).set_index('index')
# df_px_['Percentile'] = df_px_['Percentile'] / len(df_px_['Percentile'])
# df3 = df.join(df_px_[['Percentile']])
# plot_alma(df3, save_html=False)
from utils.alma_plot import *
# Draw the ALMA plot from df2 (df plus the 'Percentile' column joined in above);
# save_html=False is forwarded to plot_alma.
plot_alma(df2, save_html=False)
# from utils.alma_plot2 import *
# df_px_ = df_px.sort_values(by='38CpG-AMLsignature').reset_index().reset_index(names=['Percentile']).set_index('index')
# df_px_['Percentile'] = df_px_['Percentile'] / len(df_px_['Percentile'])
# df3 = df.join(df_px_[['Percentile']])
# plot_alma(df3, save_html=False)
Patient Characteristics#
ALMA (unsupervised)#
Show code cell source
from tableone import TableOne
from datetime import date
# Variables summarised in the ALMA (unsupervised) patient-characteristics table
columns = ['Hematopoietic Entity', 'Age (group years)', 'Sex',
           'Clinical Trial']

# Display order of category levels handed to TableOne.
# The original literal listed 'FLT3 ITD' twice; Python keeps only the last value
# of a repeated key, so ['Yes'] is the one that was ever in effect and is kept.
# NOTE(review): apart from 'Age (group years)' these keys name columns that are
# not in `columns` above - presumably copied from the cohort tables; confirm
# whether they can be removed.
category_order = {
    'FLT3 ITD': ['Yes'],
    'Age (group years)': ['0-5', '5-13', '13-39', '39-60'],
    'MRD 1 Status': ['Positive'],
    'Risk Group': ['High Risk', 'Standard Risk'],
    'Leucocyte counts (10⁹/L)': ['≥30'],
    'Age group (years)': ['≥10'],
}

mytable_cog = TableOne(df_train.reset_index(), columns,
                       overall=False, missing=False,
                       pval=False, pval_adjust=False,
                       htest_name=True, dip_test=True,
                       tukey_test=True, normal_test=True,
                       order=category_order)

# Export a dated copy of the table, then render it as HTML (cell output)
mytable_cog.to_excel('../data/pt_characteristics_alma_model_' + str(date.today()) +'.xlsx')
mytable_cog.tabulate(tablefmt="html",
                     # headers=[score_name,"",'Missing','Discovery','Validation','p-value','Statistical Test']
                     )
Show code cell output
| Overall | ||
|---|---|---|
| n | 3314 | |
| Hematopoietic Entity, n (%) | Acute lymphoblastic leukemia (ALL) | 700 (28.3) |
| Acute myeloid leukemia (AML) | 1221 (49.4) | |
| Acute promyelocytic leukemia (APL) | 31 (1.3) | |
| Mixed phenotype acute leukemia (MPAL) | 48 (1.9) | |
| Myelodysplastic syndrome (MDS or MDS-like) | 223 (9.0) | |
| Otherwise-Normal (Control) | 251 (10.1) | |
| Age (group years), n (%) | 0-5 | 480 (24.1) |
| 5-13 | 483 (24.2) | |
| 13-39 | 663 (33.2) | |
| 39-60 | 165 (8.3) | |
| 60+ | 203 (10.2) | |
| Sex, n (%) | Female | 885 (49.1) |
| Male | 918 (50.9) | |
| Clinical Trial, n (%) | AAML03P1 | 72 (2.2) |
| AAML0531 | 628 (18.9) | |
| AAML1031 | 587 (17.7) | |
| BM normal AAML0531 | 41 (1.2) | |
| Beat AML Consortium | 316 (9.5) | |
| CCG2961 | 41 (1.2) | |
| CETLAM SMD-09 (MDS-tAML) | 166 (5.0) | |
| French GRAALL 2003–2005 | 141 (4.3) | |
| Japanese AML05 | 64 (1.9) | |
| NOPHO ALL92-2000 | 933 (28.2) | |
| TARGET ALL | 131 (4.0) | |
| TCGA AML | 194 (5.9) |
Fine-tuned (supervised) Dx and Px models#
Show code cell source
# Variables summarised in the Dx/Px discovery vs. validation table
columns = ['Age (years)', 'Age group (years)', 'Sex', 'Race or ethnic group',
           'Hispanic or Latino ethnic group', 'MRD 1 Status',
           'Leucocyte counts (10⁹/L)', 'BM leukemic blasts (%)',
           'Risk Group', 'FLT3 ITD', 'Clinical Trial']

# Cast the validation ages to float. df_test may be a filtered slice of df, so
# take an explicit copy before assigning to avoid pandas' SettingWithCopyWarning.
df_test = df_test.copy()
df_test['Age (years)'] = df_test['Age (years)'].astype(float)

# join discovery clinical data with validation clinical data; the concat keys
# end up in a 'cohort' column after reset_index()
all_cohorts = pd.concat([df_dx, df_px2, df_test],
                        axis=0, keys=['Dx Discovery', 'Px Discovery', 'Validation'],
                        names=['cohort']).reset_index()

# columns = ['Age group (years)','Sex', 'MRD 1 Status',
#             'Leucocyte counts (10⁹/L)',
#             'Risk Group','FLT3 ITD', 'Treatment Arm','Clinical Trial']

# Display order of category levels handed to TableOne.
# The original literal listed 'FLT3 ITD' twice; Python keeps only the last value
# of a repeated key, so ['Yes'] is the one that was ever in effect and is kept.
category_order = {
    'FLT3 ITD': ['Yes'],
    'Race or ethnic group': ['White', 'Black or African American', 'Asian'],
    'MRD 1 Status': ['Positive'],
    'Risk Group': ['High Risk', 'Standard Risk'],
    'Leucocyte counts (10⁹/L)': ['≥30'],
    'Age group (years)': ['≥10'],
}

mytable_cog = TableOne(all_cohorts, columns,
                       overall=False, missing=False,
                       pval=False, pval_adjust=False,
                       htest_name=True, dip_test=True,
                       tukey_test=True, normal_test=True,
                       order=category_order,
                       groupby='cohort')

# Export a dated copy of the table, then render it as HTML (cell output)
mytable_cog.to_excel('../data/pt_characteristics_fine-tuned_models_' + str(date.today()) +'.xlsx')
mytable_cog.tabulate(tablefmt="html",
                     # headers=[score_name,"",score_name,'Validation','p-value','Statistical Test']
                     )
Show code cell output
| Dx Discovery | Px Discovery | Validation | ||
|---|---|---|---|---|
| n | 2471 | 946 | 200 | |
| Age (years), mean (SD) | 19.2 (19.7) | 9.4 (6.3) | 8.8 (6.0) | |
| Age group (years), n (%) | ≥10 | 528 (47.4) | 463 (48.9) | 95 (48.0) |
| <10 | 586 (52.6) | 483 (51.1) | 103 (52.0) | |
| Sex, n (%) | Female | 711 (50.5) | 468 (49.5) | 86 (43.0) |
| Male | 697 (49.5) | 478 (50.5) | 114 (57.0) | |
| Race or ethnic group, n (%) | White | 1064 (80.5) | 697 (79.1) | 142 (71.7) |
| Black or African American | 131 (9.9) | 102 (11.6) | 32 (16.2) | |
| Asian | 65 (4.9) | 43 (4.9) | 1 (0.5) | |
| American Indian or Alaska Native | 7 (0.5) | 5 (0.6) | ||
| Other | 48 (3.6) | 28 (3.2) | 21 (10.6) | |
| Pacific Islander | 7 (0.5) | 6 (0.7) | 2 (1.0) | |
| Hispanic or Latino ethnic group, n (%) | Hispanic or Latino | 209 (19.6) | 185 (20.2) | 25 (12.6) |
| Not Hispanic or Latino | 858 (80.4) | 731 (79.8) | 173 (87.4) | |
| MRD 1 Status, n (%) | Positive | 284 (29.6) | 260 (31.5) | 76 (40.4) |
| Negative | 675 (70.4) | 566 (68.5) | 112 (59.6) | |
| Leucocyte counts (10⁹/L), n (%) | ≥30 | 579 (52.4) | 467 (49.4) | 87 (43.7) |
| <30 | 526 (47.6) | 479 (50.6) | 112 (56.3) | |
| BM leukemic blasts (%), mean (SD) | 65.7 (24.1) | 63.8 (24.5) | 60.2 (25.6) | |
| Risk Group, n (%) | High Risk | 198 (14.2) | 129 (13.8) | 51 (25.5) |
| Standard Risk | 628 (45.0) | 454 (48.7) | 86 (43.0) | |
| Low Risk | 570 (40.8) | 349 (37.4) | 63 (31.5) | |
| FLT3 ITD, n (%) | Yes | 180 (16.2) | 165 (17.5) | 31 (15.7) |
| No | 932 (83.8) | 779 (82.5) | 167 (84.3) | |
| Clinical Trial, n (%) | AAML03P1 | 62 (2.5) | 36 (3.8) | |
| AAML0531 | 517 (20.9) | 507 (53.6) | ||
| AAML1031 | 495 (20.0) | 403 (42.6) | ||
| BM normal AAML0531 | 41 (1.7) | |||
| Beat AML Consortium | 192 (7.8) | |||
| CCG2961 | 31 (1.3) | |||
| CETLAM SMD-09 (MDS-tAML) | 166 (6.7) | |||
| French GRAALL 2003–2005 | 141 (5.7) | |||
| Japanese AML05 | 9 (0.4) | |||
| NOPHO ALL92-2000 | 641 (25.9) | |||
| TARGET ALL | 56 (2.3) | |||
| TCGA AML | 120 (4.9) | |||
| AML02 | 158 (79.0) | |||
| AML08 | 42 (21.0) |
By prognostic group#
Discovery#
AML Epigenomic Risk
Show code cell source
def pt_characteristics_by_model(df, model_name, traintest = 'discovery'):
    """Tabulate patient characteristics grouped by a model's risk category.

    Builds a TableOne summary (with p-values and test names) of a fixed set of
    clinical columns, grouped by the column ``model_name``, writes it to a dated
    Excel file under ``../data/`` and returns the HTML rendering.

    Parameters
    ----------
    df : pandas.DataFrame
        Cohort to summarise; must contain the clinical columns listed below and
        the ``model_name`` column.
    model_name : str
        Name of the grouping column; also used in the file name and table header.
    traintest : str
        Cohort label used in the file name and table header.

    Returns
    -------
    The value returned by ``TableOne.tabulate(tablefmt="html", ...)``.
    """
    columns = ['Age (years)', 'Age group (years)', 'Sex', 'Race or ethnic group',
               'Hispanic or Latino ethnic group', 'MRD 1 Status',
               'Leucocyte counts (10⁹/L)', 'BM leukemic blasts (%)',
               'Risk Group', 'Clinical Trial', 'FLT3 ITD', 'Treatment Arm']

    # Display order of category levels. 'FLT3 ITD' was listed twice in the
    # original literal; only the last value (['Yes']) ever took effect.
    category_order = {
        'FLT3 ITD': ['Yes'],
        'Race or ethnic group': ['White', 'Black or African American', 'Asian'],
        'MRD 1 Status': ['Positive'],
        'Risk Group': ['High Risk', 'Standard Risk'],
        'Leucocyte counts (10⁹/L)': ['≥30'],
        'Age group (years)': ['≥10'],
    }

    mytable_cog = TableOne(df, columns,
                           overall=False, missing=False,
                           pval=True, pval_adjust=False,
                           htest_name=True, dip_test=True,
                           tukey_test=True, normal_test=True,
                           order=category_order,
                           groupby=model_name)

    mytable_cog.to_excel(
        f'../data/pt_characteristics_{model_name}_{traintest}_{date.today()}.xlsx')

    # NOTE(review): the header row hard-codes 'High' then 'Low'; this assumes the
    # grouping column has exactly these two levels in this order - confirm.
    headers = [model_name + ' ' + traintest, "", 'High', 'Low', 'p-value', 'Statistical Test']
    return mytable_cog.tabulate(tablefmt="html", headers=headers)
# Discovery table (df_px2), grouped by the column named in model_name
pt_characteristics_by_model(df_px2, model_name, 'Discovery')
Show code cell output
| AML Epigenomic Risk Discovery | High | Low | p-value | Statistical Test | |
|---|---|---|---|---|---|
| n | 442 | 504 | |||
| Age (years), mean (SD) | 8.7 (6.5) | 10.0 (6.2) | 0.002 | Two Sample T-test | |
| Age group (years), n (%) | ≥10 | 200 (45.2) | 263 (52.2) | 0.039 | Chi-squared |
| <10 | 242 (54.8) | 241 (47.8) | |||
| Sex, n (%) | Female | 215 (48.6) | 253 (50.2) | 0.680 | Chi-squared |
| Male | 227 (51.4) | 251 (49.8) | |||
| Race or ethnic group, n (%) | White | 323 (78.2) | 374 (79.9) | 0.971 | Chi-squared (warning: expected count < 5) |
| Black or African American | 52 (12.6) | 50 (10.7) | |||
| Asian | 20 (4.8) | 23 (4.9) | |||
| American Indian or Alaska Native | 2 (0.5) | 3 (0.6) | |||
| Other | 13 (3.1) | 15 (3.2) | |||
| Pacific Islander | 3 (0.7) | 3 (0.6) | |||
| Hispanic or Latino ethnic group, n (%) | Hispanic or Latino | 84 (19.5) | 101 (20.8) | 0.699 | Chi-squared |
| Not Hispanic or Latino | 346 (80.5) | 385 (79.2) | |||
| MRD 1 Status, n (%) | Positive | 158 (40.9) | 102 (23.2) | <0.001 | Chi-squared |
| Negative | 228 (59.1) | 338 (76.8) | |||
| Leucocyte counts (10⁹/L), n (%) | ≥30 | 190 (43.0) | 277 (55.0) | <0.001 | Chi-squared |
| <30 | 252 (57.0) | 227 (45.0) | |||
| BM leukemic blasts (%), mean (SD) | 65.5 (26.2) | 62.2 (22.9) | 0.050 | Two Sample T-test | |
| Risk Group, n (%) | High Risk | 90 (20.8) | 39 (7.8) | <0.001 | Chi-squared |
| Standard Risk | 310 (71.8) | 144 (28.8) | |||
| Low Risk | 32 (7.4) | 317 (63.4) | |||
| Clinical Trial, n (%) | AAML03P1 | 19 (4.3) | 17 (3.4) | 0.017 | Chi-squared |
| AAML0531 | 215 (48.6) | 292 (57.9) | |||
| AAML1031 | 208 (47.1) | 195 (38.7) | |||
| FLT3 ITD, n (%) | Yes | 88 (20.0) | 77 (15.3) | 0.074 | Chi-squared |
| No | 353 (80.0) | 426 (84.7) | |||
| Treatment Arm, n (%) | Arm A | 110 (47.0) | 148 (48.1) | 0.878 | Chi-squared |
| Arm B | 124 (53.0) | 160 (51.9) |
38CpG-AMLsignature-37CpGs
Show code cell source
# Discovery table (df_px2), grouped by the '38CpG-AMLsignature Categorical' column
pt_characteristics_by_model(df_px2, model_name='38CpG-AMLsignature Categorical', traintest='Discovery')
Show code cell output
| 38CpG-AMLsignature Categorical Discovery | High | Low | p-value | Statistical Test | |
|---|---|---|---|---|---|
| n | 473 | 473 | |||
| Age (years), mean (SD) | 8.8 (6.6) | 10.0 (6.0) | 0.003 | Two Sample T-test | |
| Age group (years), n (%) | ≥10 | 215 (45.5) | 248 (52.4) | 0.037 | Chi-squared |
| <10 | 258 (54.5) | 225 (47.6) | |||
| Sex, n (%) | Female | 241 (51.0) | 227 (48.0) | 0.398 | Chi-squared |
| Male | 232 (49.0) | 246 (52.0) | |||
| Race or ethnic group, n (%) | White | 341 (78.0) | 356 (80.2) | 0.130 | Chi-squared (warning: expected count < 5) |
| Black or African American | 59 (13.5) | 43 (9.7) | |||
| Asian | 24 (5.5) | 19 (4.3) | |||
| American Indian or Alaska Native | 2 (0.5) | 3 (0.7) | |||
| Other | 10 (2.3) | 18 (4.1) | |||
| Pacific Islander | 1 (0.2) | 5 (1.1) | |||
| Hispanic or Latino ethnic group, n (%) | Hispanic or Latino | 82 (18.0) | 103 (22.4) | 0.114 | Chi-squared |
| Not Hispanic or Latino | 374 (82.0) | 357 (77.6) | |||
| MRD 1 Status, n (%) | Positive | 158 (38.7) | 102 (24.4) | <0.001 | Chi-squared |
| Negative | 250 (61.3) | 316 (75.6) | |||
| Leucocyte counts (10⁹/L), n (%) | ≥30 | 214 (45.2) | 253 (53.5) | 0.013 | Chi-squared |
| <30 | 259 (54.8) | 220 (46.5) | |||
| BM leukemic blasts (%), mean (SD) | 66.2 (25.1) | 61.4 (23.8) | 0.004 | Two Sample T-test | |
| Risk Group, n (%) | High Risk | 85 (18.3) | 44 (9.4) | <0.001 | Chi-squared |
| Standard Risk | 323 (69.6) | 131 (28.0) | |||
| Low Risk | 56 (12.1) | 293 (62.6) | |||
| Clinical Trial, n (%) | AAML03P1 | 18 (3.8) | 18 (3.8) | 0.375 | Chi-squared |
| AAML0531 | 264 (55.8) | 243 (51.4) | |||
| AAML1031 | 191 (40.4) | 212 (44.8) | |||
| FLT3 ITD, n (%) | Yes | 97 (20.6) | 68 (14.4) | 0.016 | Chi-squared |
| No | 375 (79.4) | 404 (85.6) | |||
| Treatment Arm, n (%) | Arm A | 133 (47.3) | 125 (47.9) | 0.964 | Chi-squared |
| Arm B | 148 (52.7) | 136 (52.1) |
Validation#
AML Epigenomic Risk
Show code cell source
# Validation table (df_test), grouped by the column named in model_name
pt_characteristics_by_model(df_test, model_name, 'validation')
Show code cell output
| AML Epigenomic Risk validation | High | Low | p-value | Statistical Test | |
|---|---|---|---|---|---|
| n | 88 | 112 | |||
| Age (years), mean (SD) | 7.9 (6.1) | 9.4 (5.8) | 0.083 | Two Sample T-test | |
| Age group (years), n (%) | ≥10 | 35 (40.7) | 60 (53.6) | 0.098 | Chi-squared |
| <10 | 51 (59.3) | 52 (46.4) | |||
| Sex, n (%) | Female | 38 (43.2) | 48 (42.9) | 1.000 | Chi-squared |
| Male | 50 (56.8) | 64 (57.1) | |||
| Race or ethnic group, n (%) | White | 63 (73.3) | 79 (70.5) | 0.688 | Chi-squared (warning: expected count < 5) |
| Black or African American | 14 (16.3) | 18 (16.1) | |||
| Asian | 1 (1.2) | ||||
| Other | 7 (8.1) | 14 (12.5) | |||
| Pacific Islander | 1 (1.2) | 1 (0.9) | |||
| Hispanic or Latino ethnic group, n (%) | Hispanic or Latino | 15 (17.4) | 10 (8.9) | 0.116 | Chi-squared |
| Not Hispanic or Latino | 71 (82.6) | 102 (91.1) | |||
| MRD 1 Status, n (%) | Positive | 42 (49.4) | 34 (33.0) | 0.033 | Chi-squared |
| Negative | 43 (50.6) | 69 (67.0) | |||
| Leucocyte counts (10⁹/L), n (%) | ≥30 | 34 (39.1) | 53 (47.3) | 0.308 | Chi-squared |
| <30 | 53 (60.9) | 59 (52.7) | |||
| BM leukemic blasts (%), mean (SD) | 67.8 (25.6) | 54.1 (24.1) | <0.001 | Two Sample T-test | |
| Risk Group, n (%) | High Risk | 31 (35.2) | 20 (17.9) | <0.001 | Chi-squared |
| Standard Risk | 47 (53.4) | 39 (34.8) | |||
| Low Risk | 10 (11.4) | 53 (47.3) | |||
| Clinical Trial, n (%) | AML02 | 71 (80.7) | 87 (77.7) | 0.732 | Chi-squared |
| AML08 | 17 (19.3) | 25 (22.3) | |||
| FLT3 ITD, n (%) | Yes | 16 (18.4) | 15 (13.5) | 0.459 | Chi-squared |
| No | 71 (81.6) | 96 (86.5) | |||
| Treatment Arm, n (%) | Arm A | 46 (53.5) | 60 (53.6) | 1.000 | Chi-squared |
| Arm B | 40 (46.5) | 52 (46.4) |
38CpG-AMLsignature-37CpGs
Show code cell source
# Validation table (df_test), grouped by the '38CpG-AMLsignature Categorical' column
pt_characteristics_by_model(df_test, model_name='38CpG-AMLsignature Categorical', traintest='Validation')
Show code cell output
| 38CpG-AMLsignature Categorical Validation | High | Low | p-value | Statistical Test | |
|---|---|---|---|---|---|
| n | 111 | 89 | |||
| Age (years), mean (SD) | 7.9 (6.2) | 9.9 (5.6) | 0.020 | Two Sample T-test | |
| Age group (years), n (%) | ≥10 | 46 (41.8) | 49 (55.7) | 0.072 | Chi-squared |
| <10 | 64 (58.2) | 39 (44.3) | |||
| Sex, n (%) | Female | 50 (45.0) | 36 (40.4) | 0.611 | Chi-squared |
| Male | 61 (55.0) | 53 (59.6) | |||
| Race or ethnic group, n (%) | White | 79 (72.5) | 63 (70.8) | 0.854 | Chi-squared (warning: expected count < 5) |
| Black or African American | 18 (16.5) | 14 (15.7) | |||
| Asian | 1 (0.9) | ||||
| Other | 10 (9.2) | 11 (12.4) | |||
| Pacific Islander | 1 (0.9) | 1 (1.1) | |||
| Hispanic or Latino ethnic group, n (%) | Hispanic or Latino | 14 (12.7) | 11 (12.5) | 1.000 | Chi-squared |
| Not Hispanic or Latino | 96 (87.3) | 77 (87.5) | |||
| MRD 1 Status, n (%) | Positive | 50 (48.1) | 26 (31.0) | 0.026 | Chi-squared |
| Negative | 54 (51.9) | 58 (69.0) | |||
| Leucocyte counts (10⁹/L), n (%) | ≥30 | 48 (43.6) | 39 (43.8) | 1.000 | Chi-squared |
| <30 | 62 (56.4) | 50 (56.2) | |||
| BM leukemic blasts (%), mean (SD) | 62.3 (27.9) | 57.7 (22.5) | 0.227 | Two Sample T-test | |
| Risk Group, n (%) | High Risk | 38 (34.2) | 13 (14.6) | <0.001 | Chi-squared |
| Standard Risk | 59 (53.2) | 27 (30.3) | |||
| Low Risk | 14 (12.6) | 49 (55.1) | |||
| Clinical Trial, n (%) | AML02 | 86 (77.5) | 72 (80.9) | 0.678 | Chi-squared |
| AML08 | 25 (22.5) | 17 (19.1) | |||
| FLT3 ITD, n (%) | Yes | 20 (18.2) | 11 (12.5) | 0.370 | Chi-squared |
| No | 90 (81.8) | 77 (87.5) | |||
| Treatment Arm, n (%) | Arm A | 62 (56.9) | 44 (49.4) | 0.367 | Chi-squared |
| Arm B | 47 (43.1) | 45 (50.6) |
Kaplan-Meier Plots#
Overall study population#
AML Epigenomic Risk
Show code cell source
# One draw_kaplan_meier call per cohort, using the column named in model_name.
km_kwargs = dict(save_survival_table=False,
                 save_plot=False,
                 show_ci=False,
                 add_risk_counts=False,
                 figsize=(8, 8))
for trial, dataset in (('Discovery', df_px2), ('Validation', df_test)):
    draw_kaplan_meier(model_name=model_name, df=dataset, trialname=trial, **km_kwargs)
Show code cell output
38CpG-AMLsignature
Show code cell source
# One draw_kaplan_meier call per cohort for the categorical 38CpG signature.
km_kwargs = dict(save_survival_table=False,
                 save_plot=False,
                 show_ci=False,
                 add_risk_counts=False,
                 figsize=(8, 8))
for trial, dataset in (('Discovery', df_px2), ('Validation', df_test)):
    draw_kaplan_meier(model_name='38CpG-AMLsignature Categorical',
                      df=dataset, trialname=trial, **km_kwargs)
Show code cell output
Per risk group#
AML Epigenomic Risk
Show code cell source
# For each cohort, draw one Kaplan-Meier plot per 'Risk Group' level, using only
# the rows of that level.
km_kwargs = dict(save_plot=False, save_survival_table=False,
                 add_risk_counts=False, figsize=(8, 8))
for trial, dataset in (('Discovery', df_px2), ('Validation', df_test)):
    for risk_group in ('High Risk', 'Low Risk', 'Standard Risk'):
        in_group = dataset['Risk Group'] == risk_group
        draw_kaplan_meier(model_name=model_name,
                          df=dataset[in_group],
                          trialname=f'{trial} {risk_group}',
                          **km_kwargs)
Show code cell output
38CpG-AMLsignature-37CpGs
Show code cell source
# For each cohort, draw one Kaplan-Meier plot of the categorical 38CpG signature
# per 'Risk Group' level, using only the rows of that level.
km_kwargs = dict(save_plot=False, save_survival_table=False,
                 add_risk_counts=False, figsize=(8, 8))
for trial, dataset in (('Discovery', df_px2), ('Validation', df_test)):
    for risk_group in ('High Risk', 'Low Risk', 'Standard Risk'):
        in_group = dataset['Risk Group'] == risk_group
        draw_kaplan_meier(model_name='38CpG-AMLsignature Categorical',
                          df=dataset[in_group],
                          trialname=f'{trial} {risk_group}',
                          **km_kwargs)
Show code cell output
Per risk group (AAML1831 COG)#
AML Epigenomic Risk
Show code cell source
# Discovery cohort only: one Kaplan-Meier plot per 'Risk Group AAML1831' level.
dataset, trial = df_px2, 'Discovery'
km_kwargs = dict(save_plot=False, save_survival_table=False,
                 add_risk_counts=False, figsize=(8, 8))
for risk_group in ('High', 'Low', 'Standard'):
    in_group = dataset['Risk Group AAML1831'] == risk_group
    draw_kaplan_meier(model_name=model_name,
                      df=dataset[in_group],
                      trialname=f'{trial} {risk_group} Risk',
                      **km_kwargs)
Show code cell output
38CpG-AMLsignature-37CpGs
Show code cell source
# Discovery cohort only: one Kaplan-Meier plot of the categorical 38CpG signature
# per 'Risk Group AAML1831' level.
dataset, trial = df_px2, 'Discovery'
km_kwargs = dict(save_plot=False, save_survival_table=False,
                 add_risk_counts=False, figsize=(8, 8))
for risk_group in ('High', 'Low', 'Standard'):
    in_group = dataset['Risk Group AAML1831'] == risk_group
    draw_kaplan_meier(model_name='38CpG-AMLsignature Categorical',
                      df=dataset[in_group],
                      trialname=f'{trial} {risk_group} Risk',
                      **km_kwargs)
Show code cell output
Forest Plots#
With MRD 1 and BM blast (%)#
AML Epigenomic Risk
Show code cell source
# Alias columns added to the plotting frame (new name -> existing column).
# Presumably needed because draw_forest_plot_withBMblast wants names without
# spaces or dots - TODO confirm against utils.plots.
forest_aliases = {
    'AML_Epigenomic_Risk': 'AML Epigenomic Risk',
    'MethylScoreAML_Categorical': '38CpG-AMLsignature Categorical',
    'os_time_5y': 'os.time at 5y',
    'os_evnt_5y': 'os.evnt at 5y',
    'efs_time_5y': 'efs.time at 5y',
    'efs_evnt_5y': 'efs.evnt at 5y',
}

for trial, dataset in (('Discovery', df_px2), ('Validation', df_test)):
    # Work on a copy so the cohort frames themselves are left untouched.
    df_ = dataset.copy()

    # Bin blast percentage into (0, 50] and (50, 100]. With pd.cut's defaults the
    # lowest edge is excluded, so a value of exactly 0 becomes NaN.
    blast_pct = df_['BM leukemic blasts (%)']
    df_['BM leukemic blasts (%)'] = pd.cut(blast_pct, bins=[0,50,100], labels=['≤50', '>50'])

    for alias, source in forest_aliases.items():
        df_[alias] = df_[source]

    # Overall survival first, then event-free survival.
    for time_col, event_col in (('os_time_5y', 'os_evnt_5y'),
                                ('efs_time_5y', 'efs_evnt_5y')):
        draw_forest_plot_withBMblast(time=time_col,
                                     event=event_col,
                                     df=df_,
                                     trialname=trial,
                                     model_name='AML_Epigenomic_Risk',
                                     save_plot=False)
Show code cell output
38CpG-AMLsignature-37CpGs
Show code cell source
# Forest plots for the categorical 38CpG signature, one pair (OS, EFS) per cohort.
#
# The plotting frame is rebuilt from `dataset` on every iteration. The earlier
# version passed the `df_` left over from the previous cell, which is the
# validation copy, so the plot titled 'Discovery' was drawn from validation data.
for dataset, trial in zip([df_px2, df_test], ['Discovery', 'Validation']):
    # Same preparation as in the AML Epigenomic Risk forest-plot cell, so both
    # models are plotted from identically prepared frames.
    df_ = dataset.copy()
    # Bins are (0, 50] and (50, 100]; with pd.cut's defaults a value of exactly 0
    # becomes NaN.
    df_['BM leukemic blasts (%)'] = pd.cut(df_['BM leukemic blasts (%)'], bins=[0,50,100], labels=['≤50', '>50'])
    df_['AML_Epigenomic_Risk'] = df_['AML Epigenomic Risk']
    df_['MethylScoreAML_Categorical'] = df_['38CpG-AMLsignature Categorical']
    df_['os_time_5y'] = df_['os.time at 5y']
    df_['os_evnt_5y'] = df_['os.evnt at 5y']
    df_['efs_time_5y'] = df_['efs.time at 5y']
    df_['efs_evnt_5y'] = df_['efs.evnt at 5y']

    # Overall survival
    draw_forest_plot_withBMblast(time='os_time_5y',
                                 event='os_evnt_5y',
                                 df=df_,
                                 trialname=trial,
                                 model_name='MethylScoreAML_Categorical',
                                 save_plot=False)
    # Event-free survival
    draw_forest_plot_withBMblast(time='efs_time_5y',
                                 event='efs_evnt_5y',
                                 df=df_,
                                 trialname=trial,
                                 model_name='MethylScoreAML_Categorical',
                                 save_plot=False)
Show code cell output
ROC AUC performance#
Diagnostic Model#
Show code cell source
def process_dataset_for_multiclass_auc(df, label_col='WHO 2022 Diagnosis',
                                       score_cols=slice(-34, -6)):
    """Assemble the table used for the multiclass ROC/AUC plots.

    Parameters
    ----------
    df : pandas.DataFrame
        Cohort frame containing ``label_col`` and the score columns.
    label_col : str
        Column holding the class labels to one-hot encode. Defaults to
        'WHO 2022 Diagnosis', the column the original code used.
    score_cols : slice
        Positional column selection passed to ``df.iloc[:, score_cols]``.
        The default reproduces the original hard-coded ``-34:-6`` window;
        presumably these are the per-class score columns - TODO confirm,
        since the window depends on the column layout of ``df``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_auc, dummies)``: the selected columns with the 0/1 indicator
        columns appended, and the indicator columns on their own.
    """
    # One-hot encode the labels and turn the boolean indicators into integers
    dummies = pd.get_dummies(df[label_col]).astype(int)
    # Put the selected columns and the indicators side by side (aligned on index)
    df_auc = pd.concat([df.iloc[:, score_cols], dummies], axis=1)
    return df_auc, dummies
# Build the ROC input tables for the three cohorts
df_dx_auc_train, df_dx_dummies_train = process_dataset_for_multiclass_auc(df_dx)
df_dx_auc_cog, df_dx_dummies_cog = process_dataset_for_multiclass_auc(df_px2)
df_dx_auc_test, df_dx_dummies_test = process_dataset_for_multiclass_auc(df_test)

# One multiclass ROC panel per cohort, in display order
roc_panels = [
    plot_multiclass_roc_auc(auc_table, dummies.columns, title=panel_title)
    for auc_table, dummies, panel_title in (
        (df_dx_auc_train, df_dx_dummies_train, 'Discovery'),
        (df_dx_auc_cog, df_dx_dummies_cog, 'Discovery COG peds AML'),
        (df_dx_auc_test, df_dx_dummies_test, 'Validation'),
    )
]
p1, p2, p3 = roc_panels

# Lay the panels out in a single row and display them
p = gridplot([roc_panels], toolbar_location='above')
show(p)
Show code cell output
Prognostic models#
Discovery#
Show code cell source
# Outcome plus the two model outputs, categorical and continuous, for df_px2
df_cat = df_px2[['os.evnt at 5y', '38CpG-AMLsignature Categorical', 'AML Epigenomic Risk']]
df_cont = df_px2[['os.evnt at 5y', '38CpG-AMLsignature', 'P(Death) at 5y']]

# Give both frames the same display names for the two models
df_cont = df_cont.rename(columns={'P(Death) at 5y':'AML Epigenomic Risk (PaCMAP-LGBM)',
                                  '38CpG-AMLsignature': '38CpG-AMLsignature (EWAS-CoxPH)'})
df_cat = df_cat.rename(columns={'AML Epigenomic Risk':'AML Epigenomic Risk (PaCMAP-LGBM)',
                                '38CpG-AMLsignature Categorical': '38CpG-AMLsignature (EWAS-CoxPH)'})

# Explicit copy: the columns are overwritten below, and assigning into a column
# selection of df_px2 triggers pandas' SettingWithCopyWarning.
risk = df_px2[['Risk Group AAML1831','Risk Group']].copy()

# Numeric encoding of the risk labels: low = 0, standard = 0.5, high = 1.
# Labels missing from this mapping become NaN under Series.map.
low_high_dict = {'Low': 0, 'Low Risk': 0,
                 'Standard':0.5, 'Standard Risk': 0.5,
                 'High': 1, 'High Risk': 1}

for risk_col in ('Risk Group', 'Risk Group AAML1831'):
    risk[risk_col] = risk[risk_col].map(low_high_dict)
for model_col in ('AML Epigenomic Risk (PaCMAP-LGBM)', '38CpG-AMLsignature (EWAS-CoxPH)'):
    df_cat[model_col] = df_cat[model_col].map(low_high_dict)

df_cont_risk = df_cont.join(risk)
df_cat_risk = df_cat.join(risk)

# NOTE(review): fillna(0.5) applies to every column of the joined frames (outcome
# and model scores included), not only to the risk-group columns - confirm that
# this is intended.
df_cont_risk = df_cont_risk.fillna(0.5)
df_cat_risk = df_cat_risk.fillna(0.5)

p1 = plot_roc_auc(df_cont_risk, 'os.evnt at 5y',title= 'Continuous (prob. of death at 5y)')
p2 = plot_roc_auc(df_cat_risk, 'os.evnt at 5y',title= 'Categorical (high-low risk)')

# Create a gridplot
p = gridplot([[p1, p2]], toolbar_location='above')
show(p)
Show code cell output
Validation#
Show code cell source
# Outcome plus the two model outputs, categorical and continuous, for df_test
df_cat = df_test[['os.evnt at 5y', 'AML Epigenomic Risk', '38CpG-AMLsignature Categorical']]
df_cont = df_test[['os.evnt at 5y', 'P(Death) at 5y', '38CpG-AMLsignature']]

# Give both frames the same display names for the two models
df_cont = df_cont.rename(columns={'P(Death) at 5y':'AML Epigenomic Risk (PaCMAP-LGBM)',
                                  '38CpG-AMLsignature': '38CpG-AMLsignature (EWAS-CoxPH)'})
df_cat = df_cat.rename(columns={'AML Epigenomic Risk':'AML Epigenomic Risk (PaCMAP-LGBM)',
                                '38CpG-AMLsignature Categorical': '38CpG-AMLsignature (EWAS-CoxPH)'})

# Explicit copy: the column is overwritten below, and assigning into a column
# selection of df_test triggers pandas' SettingWithCopyWarning.
risk = df_test[['Risk Group']].copy()

# low_high_dict is the label -> number mapping defined in the Discovery cell
risk['Risk Group'] = risk['Risk Group'].map(low_high_dict)
for model_col in ('AML Epigenomic Risk (PaCMAP-LGBM)', '38CpG-AMLsignature (EWAS-CoxPH)'):
    df_cat[model_col] = df_cat[model_col].map(low_high_dict)

df_cont_risk_test = df_cont.join(risk)
df_cat_risk_test = df_cat.join(risk)

# Rename `Risk Group` to `Risk Group AML02-08`
df_cont_risk_test = df_cont_risk_test.rename(columns={'Risk Group':'Risk Group AML02-08'})
df_cat_risk_test = df_cat_risk_test.rename(columns={'Risk Group':'Risk Group AML02-08'})

p1 = plot_roc_auc(df_cont_risk_test, 'os.evnt at 5y',title= 'Continuous (prob. of death at 5y)')
p2 = plot_roc_auc(df_cat_risk_test, 'os.evnt at 5y',title= 'Categorical (high-low risk)')

# Create a gridplot
p = gridplot([[p1, p2]], toolbar_location='above')
show(p)
Show code cell output
Pearson Correlation#
Discovery#
Show code cell source
# Scatter of the two continuous model scores in the discovery frame
draw_scatter_pearson(df=df_cont_risk,x='38CpG-AMLsignature (EWAS-CoxPH)', y='AML Epigenomic Risk (PaCMAP-LGBM)',s=20)
# Pairwise correlations (DataFrame.corr default, i.e. Pearson) of every column
# except the first one ('os.evnt at 5y'), rounded to two decimals
df_cont_risk.iloc[:,1:].corr().round(2)
Show code cell output
| 38CpG-AMLsignature (EWAS-CoxPH) | AML Epigenomic Risk (PaCMAP-LGBM) | Risk Group AAML1831 | Risk Group | |
|---|---|---|---|---|
| 38CpG-AMLsignature (EWAS-CoxPH) | 1.00 | 0.74 | 0.50 | 0.54 |
| AML Epigenomic Risk (PaCMAP-LGBM) | 0.74 | 1.00 | 0.53 | 0.59 |
| Risk Group AAML1831 | 0.50 | 0.53 | 1.00 | 0.62 |
| Risk Group | 0.54 | 0.59 | 0.62 | 1.00 |
Validation#
Show code cell source
# Scatter of the two continuous model scores in the validation frame
draw_scatter_pearson(df=df_cont_risk_test,x='38CpG-AMLsignature (EWAS-CoxPH)', y='AML Epigenomic Risk (PaCMAP-LGBM)',s=20)
# Pairwise correlations (DataFrame.corr default, i.e. Pearson) of every column
# except the first one ('os.evnt at 5y'), rounded to two decimals
df_cont_risk_test.iloc[:,1:].corr().round(2)
Show code cell output
| AML Epigenomic Risk (PaCMAP-LGBM) | 38CpG-AMLsignature (EWAS-CoxPH) | Risk Group AML02-08 | |
|---|---|---|---|
| AML Epigenomic Risk (PaCMAP-LGBM) | 1.00 | 0.69 | 0.51 |
| 38CpG-AMLsignature (EWAS-CoxPH) | 0.69 | 1.00 | 0.46 |
| Risk Group AML02-08 | 0.51 | 0.46 | 1.00 |
Sankey plots#
Note
Sankey plots below compare the distribution of categories. The width of the lines is proportional to the number of patients in each group.
Samples with annotated diagnosis info#
Show code cell source
colors = get_custom_color_palette()

# Same column pair, palette and options for all three cohorts: WHO 2022
# diagnosis against the predicted AL epigenomic subtype, with nan_action='drop'.
dx_vs_subtype = ('WHO 2022 Diagnosis', 'AL Epigenomic Subtype', colors)
sankey_opts = dict(fontsize=8, nan_action='drop')

draw_sankey_plot(df_train, *dx_vs_subtype,
                 title='Discovery cohort', fig_size=(4, 11), **sankey_opts)
draw_sankey_plot(df_px2, *dx_vs_subtype,
                 title= 'Discovery cohort (COG peds AML Dx samples only)', fig_size=(4, 10),
                 **sankey_opts)
draw_sankey_plot(df_test, *dx_vs_subtype,
                 title= 'Validation cohort', fig_size=(3, 7), **sankey_opts)
Show code cell output
Predictions for samples with no WHO 2022 diagnosis available#
Show code cell source
# Same column pair as the annotated-diagnosis plots, but with
# nan_action='keep only' for all three cohorts.
dx_vs_subtype = ('WHO 2022 Diagnosis', 'AL Epigenomic Subtype', colors)
sankey_opts = dict(fontsize=8, nan_action='keep only')

draw_sankey_plot(df_train, *dx_vs_subtype,
                 title='Discovery cohort', fig_size=(4, 9), **sankey_opts)
draw_sankey_plot(df_px2, *dx_vs_subtype,
                 title= 'Discovery cohort (COG peds AML Dx samples only)', fig_size=(4, 8),
                 **sankey_opts)
draw_sankey_plot(df_test, *dx_vs_subtype,
                 title= 'Validation cohort', fig_size=(4, 8), **sankey_opts)
Show code cell output
Reason for unclassified samples#
Show code cell source
# WHO 2022 diagnosis against a cytogenetic / fusion annotation, with
# nan_action='keep only'. The COG subset uses 'Gene Fusion'; the other two
# cohorts use 'Primary Cytogenetic Code'.
sankey_opts = dict(fontsize=8, nan_action='keep only')

draw_sankey_plot(df_train, 'WHO 2022 Diagnosis', 'Primary Cytogenetic Code', colors,
                 title='Discovery cohort', fig_size=(4, 6), **sankey_opts)
draw_sankey_plot(df_px2, 'WHO 2022 Diagnosis', 'Gene Fusion', colors,
                 title= 'Discovery cohort (COG peds AML Dx samples only)', fig_size=(4, 9),
                 **sankey_opts)
draw_sankey_plot(df_test, 'WHO 2022 Diagnosis', 'Primary Cytogenetic Code', colors,
                 title= 'Validation cohort', fig_size=(2, 3), **sankey_opts)
Show code cell output
Risk group comparison in COG#
Show code cell source
# Both plots use the COG discovery subset with identical title, size and options;
# only the pair of compared columns changes.
cog_sankey_opts = dict(title= 'Discovery cohort (COG peds AML Dx samples only)',
                       fig_size=(2, 4), fontsize=8, nan_action='drop')

draw_sankey_plot(df_px2, 'Risk Group', 'Risk Group AAML1831', colors, **cog_sankey_opts)
draw_sankey_plot(df_px2, 'Risk Group AAML1831', 'AML Epigenomic Risk', colors, **cog_sankey_opts)
Show code cell output
Px and Dx model comparison#
Show code cell source
# Prognostic category against the predicted diagnostic subtype for each cohort.
risk_vs_subtype = ('AML Epigenomic Risk', 'AL Epigenomic Subtype', colors)
sankey_opts = dict(fontsize=8, nan_action='drop')

draw_sankey_plot(df_train, *risk_vs_subtype,
                 title='Discovery cohort', fig_size=(3, 10), **sankey_opts)
draw_sankey_plot(df_px2, *risk_vs_subtype,
                 title= 'Discovery cohort (COG peds AML Dx samples only)', fig_size=(3, 10),
                 **sankey_opts)
draw_sankey_plot(df_test, *risk_vs_subtype,
                 title= 'Validation cohort', fig_size=(3, 8), **sankey_opts)
Show code cell output
Nanopore test results#
Show code cell source
# Restrict to the UF HemBank samples and move the sample index into an 'index' column
dfsank = df[df['Clinical Trial'] == 'UF HemBank']
dfsank = dfsank.reset_index()
# When a 'Dx at Acquisition' value occurs more than once, keep the text only on
# its last occurrence and replace every earlier occurrence with an empty string
dfsank['Dx at Acquisition'] = dfsank['Dx at Acquisition'].where(~dfsank['Dx at Acquisition'].duplicated(keep='last'), '')
# Node label: diagnosis text (possibly blank), a space, then the sample identifier
dfsank['Diagnosis --> Patient Sample'] = dfsank['Dx at Acquisition'] + ' ' + dfsank['index']
draw_sankey_plot(dfsank, 'Diagnosis --> Patient Sample','AL Epigenomic Subtype',
title= 'Specimen-to-result Test Cohort',fig_size=(6, 8),
fontsize=11, nan_action='drop', colors=colors)
Show code cell output
Performance metrics#
AML Epigenomic Risk#
Show code cell source
# Presumably (train frame, test frame, true-label column, predicted-label column,
# test true-label column) - TODO confirm the argument order against utils.plots
plot_confusion_matrix_stacked(df_px2, df_test, 'os.evnt at 5y', 'AML Epigenomic Risk_int','os.evnt at 5y')
Show code cell output
Metrics:
| | Accuracy | Sensitivity | Specificity | Precision | F1-score | AUC-ROC |
|:-----------|-----------:|--------------:|--------------:|------------:|-----------:|----------:|
| Train | 0.704 | 0.74 | 0.684 | 0.566 | 0.641 | 0.712 |
| Validation | 0.7 | 0.733 | 0.686 | 0.5 | 0.595 | 0.71 |
38CpG-AMLsignature#
Show code cell source
# Presumably (train frame, test frame, true-label column, predicted-label column,
# test true-label column) - TODO confirm the argument order against utils.plots
plot_confusion_matrix_stacked(df_px2, df_test, 'os.evnt at 5y', '38CpG-AMLsignature_cat_bin','os.evnt at 5y')
Show code cell output
Metrics:
| | Accuracy | Sensitivity | Specificity | Precision | F1-score | AUC-ROC |
|:-----------|-----------:|--------------:|--------------:|------------:|-----------:|----------:|
| Train | 0.675 | 0.746 | 0.637 | 0.533 | 0.621 | 0.691 |
| Validation | 0.615 | 0.783 | 0.543 | 0.423 | 0.55 | 0.663 |
AL Epigenomic Subtype#
Show code cell source
# Diagnostic model: 'WHO 2022 Diagnosis' against 'AL Epigenomic Subtype' for df_dx
# and df_test. Presumably the arguments are (train frame, test frame, true-label
# column, predicted-label column, test true-label column) - TODO confirm
plot_confusion_matrix_stacked(df_dx, df_test, 'WHO 2022 Diagnosis', 'AL Epigenomic Subtype', 'WHO 2022 Diagnosis', figsize=(22,14))
Show code cell output
Metrics:
| | Accuracy | Macro F1 | Weighted F1 | Cohen's Kappa |
|:-----------|-----------:|-----------:|--------------:|----------------:|
| Train | 0.963 | 0.948 | 0.963 | 0.96 |
| Validation | 0.901 | 0.46 | 0.94 | 0.859 |
Box plots#
AML Epigenomic Risk#
Show code cell source
# Box plots of 'P(Death) at 5y' in the validation cohort, split by three clinical
# variables and coloured by the column named in model_name.
box_opts = dict(df=df_test, y='P(Death) at 5y',
                trialname='Validation', hue=model_name,
                save_plot=False, figsize=(4,4))

draw_boxplot(x='Risk Group',
             order=['High Risk', 'Standard Risk', 'Low Risk'], **box_opts)
draw_boxplot(x='MRD 1 Status',
             order=['Positive','Negative'], **box_opts)
draw_boxplot(x='Primary Cytogenetic Code',
             order='auto', **box_opts)
Show code cell output
38CpG-AMLsignature#
Show code cell source
# Box plots of the continuous '38CpG-AMLsignature' score in the validation cohort,
# split by three clinical variables.
# NOTE(review): hue is whatever model_name holds when this cell runs. On a
# top-to-bottom run that is still 'AML Epigenomic Risk'; confirm whether
# '38CpG-AMLsignature Categorical' was intended for this section.
box_opts = dict(df=df_test, y='38CpG-AMLsignature',
                trialname='Validation', hue=model_name,
                save_plot=False, figsize=(4,4))

draw_boxplot(x='Risk Group',
             order=['High Risk', 'Standard Risk', 'Low Risk'], **box_opts)
draw_boxplot(x='MRD 1 Status',
             order=['Positive','Negative'], **box_opts)
draw_boxplot(x='Primary Cytogenetic Code',
             order='auto', **box_opts)
Show code cell output
Stacked bar plots#
AML Epigenomic Risk#
Show code cell source
model_name = 'AML Epigenomic Risk'

# Stacked bars of the model category within each level of three clinical
# variables (validation cohort). Only x, order and fontsize differ per call.
bar_opts = dict(df=df_test, y=model_name,
                trialname='Validation', hue=model_name,
                save_plot=False, figsize=(4,3))

draw_stacked_barplot(x='MRD 1 Status',
                     order=['Positive','Negative'], **bar_opts)
draw_stacked_barplot(x='Risk Group',
                     order=['High Risk', 'Standard Risk', 'Low Risk'],
                     fontsize=9, **bar_opts)
draw_stacked_barplot(x='Primary Cytogenetic Code',
                     order='auto',
                     fontsize=6, **bar_opts)
Show code cell output
38CpG-AMLsignature#
Show code cell source
model_name = '38CpG-AMLsignature Categorical'

# Stacked bars of the signature category within each level of three clinical
# variables (validation cohort). Only x, order and fontsize differ per call.
bar_opts = dict(df=df_test, y=model_name,
                trialname='Validation', hue=model_name,
                save_plot=False, figsize=(4,3))

draw_stacked_barplot(x='MRD 1 Status',
                     order=['Positive','Negative'], **bar_opts)
draw_stacked_barplot(x='Risk Group',
                     order=['High Risk', 'Standard Risk', 'Low Risk'],
                     fontsize=9, **bar_opts)
draw_stacked_barplot(x='Primary Cytogenetic Code',
                     order='auto',
                     fontsize=6, **bar_opts)
Show code cell output
Watermark#
Author: Francisco_Marchi@Lamba_Lab_UF
Last updated: 2024-09-18
Python implementation: CPython
Python version : 3.10.13
IPython version : 8.27.0
pandas : 2.2.2
seaborn : 0.13.2
matplotlib: 3.9.2
tableone : 0.8.0
sklearn : 1.5.2
lifelines : 0.28.0
scipy : 1.12.0
Compiler : GCC 11.4.0
OS : Linux
Release : 5.15.133.1-microsoft-standard-WSL2
Machine : x86_64
Processor : x86_64
CPU cores : 32
Architecture: 64bit
Git repo: git@github.com:f-marchi/ALMA.git